# -*- coding: utf-8 -*-
"""
------------------------------------------------------------------------------
    File Name:  word2vec_demo
    Author   :  wanwei1029
    Date     :  2019-12-02 
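    Desc     : Word vectors matter because they turn natural language into vectors that a computer can work with.
    Compared with bag-of-words, TF-IDF and similar models, word vectors capture a word's context and semantics
    and measure the similarity between words, which makes them important in many NLP areas such as text
    classification and sentiment analysis.
    w2v (Word2Vec) converts every word in the corpus into a vector of the same size.
    References: https://blog.csdn.net/qq_27586341/article/details/90025288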
        https://www.jianshu.com/p/b779f8219f74
------------------------------------------------------------------------------
"""
from gensim.models import Word2Vec
from samp.utils.file_utils import read_file_to_list
from samp.utils.string_utils import tokenize


def w2v_demo():
    """Train a small Word2Vec model on the sample corpus and print one word vector.

    Steps:
      1. Read ``../data/test_data.txt`` with ``read_file_to_list`` (the path is
         relative, so presumably it is resolved against the current working
         directory -- confirm against ``read_file_to_list``).
      2. Print the number of entries that were read.
      3. Tokenize the entries with ``tokenize``.
      4. Train a 128-dimensional ``Word2Vec`` model (window=5, min_count=3,
         workers=4).
      5. Print the learned vector for the word '贷款' ("loan").

    The function works with both gensim 3.x and gensim 4.x: the embedding
    dimensionality keyword was renamed from ``size`` to ``vector_size`` in
    gensim 4.0, so the keyword is chosen by inspecting the constructor.

    If the query word was dropped from the vocabulary (it occurs fewer than
    ``min_count`` times in the tokenized corpus), a notice is printed instead
    of letting the lookup fail with a bare ``KeyError``.
    """
    import inspect

    data_file = "../data/test_data.txt"
    query_word = '贷款'
    min_count = 3

    data_list = read_file_to_list(data_file)
    print("语料大小:{0}".format(len(data_list)))
    tokenize_list = tokenize(data_list)

    # gensim >= 4.0 expects ``vector_size``; gensim < 4.0 expects ``size``.
    # Feature-detect on the constructor so either installed version works.
    init_params = inspect.signature(Word2Vec.__init__).parameters
    dim_keyword = "vector_size" if "vector_size" in init_params else "size"
    w2v_model = Word2Vec(
        tokenize_list,
        window=5,
        min_count=min_count,
        workers=4,
        **{dim_keyword: 128}
    )

    # Words rarer than ``min_count`` are pruned from the vocabulary, so check
    # membership before indexing to avoid an unexplained KeyError.
    if query_word in w2v_model.wv:
        print(w2v_model.wv[query_word])
    else:
        print("词'{0}'不在词表中(出现次数少于min_count={1})".format(query_word, min_count))


if __name__ == '__main__':
    # Script entry point: run the demo whose name is stored in ``test_method``.
    # Demos are looked up in a name -> callable table; an unknown name runs
    # nothing.
    test_method = "w2v_demo"
    demos = {"w2v_demo": w2v_demo}
    if test_method in demos:
        demos[test_method]()